Combine data files with state populations

The first data file covers 2000-2010; the second covers 2010-2016.


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from os.path import join
cwd = os.getcwd()
data_directory = join(cwd, '..', 'Data storage')

Import 2000-2010 data

The SEX, ORIGIN, RACE, and AGEGRP columns mean "All" (i.e., totals) when their value is 0; a quick sanity check of this interpretation follows the data preview below.

It's not clear from the column names whether these are beginning- or end-of-year values (the Census file layout describes the POPESTIMATE columns as July 1 estimates).

https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/


In [3]:
path = os.path.join(data_directory, 'Population data',
                    'st-est00int-alldata.csv')
pop1 = pd.read_csv(path)

In [4]:
pop1.head()


Out[4]:
REGION DIVISION STATE NAME SEX ORIGIN RACE AGEGRP ESTIMATESBASE2000 POPESTIMATE2000 ... POPESTIMATE2002 POPESTIMATE2003 POPESTIMATE2004 POPESTIMATE2005 POPESTIMATE2006 POPESTIMATE2007 POPESTIMATE2008 POPESTIMATE2009 CENSUS2010POP POPESTIMATE2010
0 0 0 0 United States 0 0 0 0 281424600 282162411 ... 287625193 290107933 292805298 295516599 298379912 301231207 304093966 306771529 308745538 309349689
1 0 0 0 United States 0 0 0 1 19176154 19178293 ... 19429192 19592446 19785885 19917400 19938883 20125962 20271127 20244518 20201362 20200529
2 0 0 0 United States 0 0 0 2 20549855 20463852 ... 19872417 19620851 19454237 19389067 19544688 19714611 19929602 20182499 20348657 20382409
3 0 0 0 United States 0 0 0 3 20528425 20637696 ... 21261421 21415353 21411680 21212579 21033138 20841042 20706655 20660564 20677194 20694011
4 0 0 0 United States 0 0 0 4 20218782 20294955 ... 20610370 20797166 21102552 21486214 21807709 22067816 22210880 22192810 22040343 21959087

5 rows × 21 columns
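
A quick sanity check of the "0 means All" interpretation, as a rough sketch: with SEX, ORIGIN, and RACE held at 0, the individual age-group rows should sum to the AGEGRP == 0 row (this assumes the age codes partition the total, as the Census file layout describes).

# Sketch: the AGEGRP == 0 row should equal the sum of the other age groups
# for the United States, with SEX, ORIGIN, RACE at their 0 ("All") codes.
us = pop1.loc[(pop1['SEX'] == 0) & (pop1['ORIGIN'] == 0) &
              (pop1['RACE'] == 0) & (pop1['NAME'] == 'United States')]
print(us.loc[us['AGEGRP'] == 0, 'POPESTIMATE2000'].iloc[0])
print(us.loc[us['AGEGRP'] > 0, 'POPESTIMATE2000'].sum())  # expect a match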


In [5]:
pop1 = pop1.loc[(pop1['SEX'] == 0) &
                (pop1['ORIGIN'] == 0) &
                (pop1['RACE'] == 0) &
                (pop1['AGEGRP'] == 0), :]

In [6]:
# Column names for population estimate
est_cols = ['POPESTIMATE{}'.format(x) for x in range(2000, 2011)]

In [7]:
# Melt the wide-form data into a tidy dataframe
pop1_tidy = pd.melt(pop1, id_vars='NAME',
                    value_vars=est_cols, var_name='Year',
                    value_name='Population')

In [8]:
pop1_tidy.head()


Out[8]:
NAME Year Population
0 United States POPESTIMATE2000 282162411
1 Alabama POPESTIMATE2000 4452173
2 Alaska POPESTIMATE2000 627963
3 Arizona POPESTIMATE2000 5160586
4 Arkansas POPESTIMATE2000 2678588

In [9]:
def map_year(x):
    'Return last 4 characters (the year)'
    year = x[-4:]
    return int(year)

pop1_tidy['Year'] = pop1_tidy['Year'].map(map_year)

The 2010 values shown below are ever so slightly different from those listed in the later dataset.


In [10]:
pop1_tidy.loc[pop1_tidy['Year'] == 2010].head()


Out[10]:
NAME Year Population
520 United States 2010 309349689
521 Alabama 2010 4785298
522 Alaska 2010 713985
523 Arizona 2010 6413737
524 Arkansas 2010 2921606

In [11]:
pop1_tidy.head()


Out[11]:
NAME Year Population
0 United States 2000 282162411
1 Alabama 2000 4452173
2 Alaska 2000 627963
3 Arizona 2000 5160586
4 Arkansas 2000 2678588

In [12]:
pop1_tidy.tail()


Out[12]:
NAME Year Population
567 Virginia 2010 8024617
568 Washington 2010 6744496
569 West Virginia 2010 1853973
570 Wisconsin 2010 5691047
571 Wyoming 2010 564460

In [13]:
pop1_tidy.columns = ['State', 'Year', 'Population']

Import 2010-2016 data


In [14]:
path = os.path.join(data_directory, 'Population data', 'nst-est2016-01.xlsx')
# Note: parse_cols and skip_footer were renamed to usecols and skipfooter
# in later pandas releases.
pop2 = pd.read_excel(path, header=3, parse_cols='A, D:J', skip_footer=7)

In [15]:
pop2.head()


Out[15]:
2010 2011 2012 2013 2014 2015 2016
United States 309348193 311663358 313998379 316204908 318563456 320896618 323127513
Northeast 55388056 55632766 55829059 55988771 56116791 56184737 56209510
Midwest 66978602 67153331 67332320 67543948 67726368 67838387 67941429
South 114863114 116061801 117299171 118424320 119696311 121039206 122319574
West 72118421 72815460 73537829 74247869 75023986 75834288 76657000

In [16]:
pop2.tail()


Out[16]:
2010 2011 2012 2013 2014 2015 2016
.Virginia 8025773 8110035 8192048 8262692 8317372 8367587 8411808
.Washington 6743226 6822520 6895226 6968006 7054196 7160290 7288000
.West Virginia 1854230 1854972 1856560 1853231 1848514 1841053 1831102
.Wisconsin 5690263 5709640 5726177 5742854 5758377 5767891 5778708
.Wyoming 564513 567725 576765 582684 583642 586555 585501

In [17]:
drop_rows = ['Northeast', 'Midwest', 'South', 'West']
pop2.drop(drop_rows, inplace=True)

In [19]:
pop2.index = pop2.index.str.strip('.')

In [20]:
pop2.head()


Out[20]:
2010 2011 2012 2013 2014 2015 2016
United States 309348193 311663358 313998379 316204908 318563456 320896618 323127513
Alabama 4785492 4799918 4815960 4829479 4843214 4853875 4863300
Alaska 714031 722713 731089 736879 736705 737709 741894
Arizona 6408312 6467163 6549634 6624617 6719993 6817565 6931071
Arkansas 2921995 2939493 2950685 2958663 2966912 2977853 2988248

In [21]:
pop2.columns


Out[21]:
Int64Index([2010, 2011, 2012, 2013, 2014, 2015, 2016], dtype='int64')

In [22]:
pop2_tidy = pd.melt(pop2.reset_index(), id_vars='index',
                    value_vars=range(2010, 2017), value_name='Population',
                    var_name='Year')
pop2_tidy.columns = ['State', 'Year', 'Population']

Combine data


In [23]:
pop_total = pd.concat([pop1_tidy, pop2_tidy])

The overlapping 2010 values differ, but only barely. I'm going to re-combine the datasets, keeping the 2010 values from the second dataset.
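
A rough sketch quantifying "just barely", using the tidy frames built above:

# Compare the overlapping 2010 estimates from the two sources by state.
p1 = pop1_tidy.loc[pop1_tidy['Year'] == 2010].set_index('State')['Population']
p2 = pop2_tidy.loc[pop2_tidy['Year'] == 2010].set_index('State')['Population']
# Largest absolute percentage difference across states (and the US total).
print(((p2 - p1) / p1 * 100).abs().max())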


In [24]:
pop_total.loc[pop_total['Year']==2010].sort_values('State')


Out[24]:
State Year Population
1 Alabama 2010 4785492
521 Alabama 2010 4785298
522 Alaska 2010 713985
2 Alaska 2010 714031
523 Arizona 2010 6413737
3 Arizona 2010 6408312
524 Arkansas 2010 2921606
4 Arkansas 2010 2921995
525 California 2010 37349363
5 California 2010 37332685
526 Colorado 2010 5049071
6 Colorado 2010 5048644
527 Connecticut 2010 3577073
7 Connecticut 2010 3579899
528 Delaware 2010 899769
8 Delaware 2010 899816
529 District of Columbia 2010 604453
9 District of Columbia 2010 605183
530 Florida 2010 18843326
10 Florida 2010 18849098
11 Georgia 2010 9713521
531 Georgia 2010 9712587
532 Hawaii 2010 1363621
12 Hawaii 2010 1363945
13 Idaho 2010 1571010
533 Idaho 2010 1571450
14 Illinois 2010 12841578
534 Illinois 2010 12843166
15 Indiana 2010 6490528
535 Indiana 2010 6490621
... ... ... ...
558 Oregon 2010 3838957
38 Oregon 2010 3838048
39 Pennsylvania 2010 12712343
559 Pennsylvania 2010 12709630
40 Rhode Island 2010 1053337
560 Rhode Island 2010 1052886
41 South Carolina 2010 4635943
561 South Carolina 2010 4636312
42 South Dakota 2010 816325
562 South Dakota 2010 816463
43 Tennessee 2010 6356671
563 Tennessee 2010 6356897
564 Texas 2010 25257114
44 Texas 2010 25244310
0 United States 2010 309348193
520 United States 2010 309349689
45 Utah 2010 2775326
565 Utah 2010 2776469
566 Vermont 2010 625960
46 Vermont 2010 625982
47 Virginia 2010 8025773
567 Virginia 2010 8024617
48 Washington 2010 6743226
568 Washington 2010 6744496
49 West Virginia 2010 1854230
569 West Virginia 2010 1853973
50 Wisconsin 2010 5690263
570 Wisconsin 2010 5691047
571 Wyoming 2010 564460
51 Wyoming 2010 564513

104 rows × 3 columns


In [25]:
pop_total = pd.concat([pop1_tidy.loc[pop1_tidy['Year'] < 2010], pop2_tidy])

In [26]:
pop_total.head()


Out[26]:
State Year Population
0 United States 2000 282162411
1 Alabama 2000 4452173
2 Alaska 2000 627963
3 Arizona 2000 5160586
4 Arkansas 2000 2678588

In [27]:
pop_total.tail()


Out[27]:
State Year Population
359 Virginia 2016 8411808
360 Washington 2016 7288000
361 West Virginia 2016 1831102
362 Wisconsin 2016 5778708
363 Wyoming 2016 585501
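
Before saving, a quick check (a sketch) that the 2010 overlap was actually resolved; each state/year combination should now appear exactly once.

# Count duplicated State/Year pairs; expect 0 after keeping only the
# pre-2010 rows from the first dataset.
print(pop_total.duplicated(subset=['State', 'Year']).sum())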

In [81]:
# Save to the same 'Data storage' directory used for the reads above.
path = os.path.join(data_directory, 'Derived data', 'State population.csv')
pop_total.to_csv(path, index=False)
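
A minimal usage sketch of the combined data, using the matplotlib import from the top of the notebook (the state is chosen arbitrarily for illustration):

# Plot one state's population across the full 2000-2016 range.
texas = pop_total.loc[pop_total['State'] == 'Texas'].sort_values('Year')
plt.plot(texas['Year'], texas['Population'])
plt.xlabel('Year')
plt.ylabel('Population')
plt.title('Texas population estimates, 2000-2016')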